{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "z7U8tcMUo7ah"
      },
      "source": [
        "## Setup"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "eY0TYzM70-cE",
        "outputId": "d9a31d1b-d0b5-4be9-dbd4-325469023861"
      },
      "outputs": [],
      "source": [
        "# Clone only when the checkout is missing, and change directory by absolute path,\n",
        "# so re-running this cell neither fails nor nests a second clone inside the first.\n",
        "# /content is the directory the rest of this notebook already hard-codes.\n",
        "!test -d /content/EvolvePro || git clone https://github.com/mat10d/EvolvePro.git /content/EvolvePro\n",
        "%cd /content/EvolvePro"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": true,
        "id": "Nxum_v_ieBlO"
      },
      "outputs": [],
      "source": [
        "%%capture\n",
        "\n",
        "# %pip installs into the environment of the running kernel; a bare !pip may\n",
        "# resolve to a different Python installation.\n",
        "%pip install pandas numpy scikit-learn scikit-learn-extra xgboost matplotlib seaborn biopython scipy torch fair-esm\n",
        "# -p: do not error when the directory already exists, so the cell can be re-run.\n",
        "!mkdir -p /content/output"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "MPYtN9W5o-UD"
      },
      "source": [
        "## Process: build the sequence files and pick initial mutants"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "xZVqxVCXftSE",
        "outputId": "89d68d99-370c-45db-b593-c36b8f31e945"
      },
      "outputs": [],
      "source": [
        "from evolvepro.src.process import (\n",
        "    generate_wt,\n",
        "    generate_single_aa_mutants,\n",
        ")\n",
        "\n",
        "# The wild-type sequence is passed to generate_wt with kelsic_WT.fasta as its output file.\n",
        "generate_wt(\n",
        "    'MAKEDNIEMQGTVLETLPNTMFRVELENGHVVTAHISGKMRKNYIRILTGDKVTVELTPYDLSKGRIVFRSR',\n",
        "    output_file='/content/output/kelsic_WT.fasta',\n",
        ")\n",
        "\n",
        "# That wild-type FASTA is the input here; the output file is kelsic.fasta.\n",
        "generate_single_aa_mutants(\n",
        "    '/content/output/kelsic_WT.fasta',\n",
        "    output_file='/content/output/kelsic.fasta',\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "hhtBO0g2gu2R",
        "outputId": "587b3178-3d78-4e2f-fd69-d27658313851"
      },
      "outputs": [],
      "source": [
        "from evolvepro.src.process import suggest_initial_mutants\n",
        "\n",
        "# Ask for 12 mutants from the mutant FASTA; random_seed is fixed,\n",
        "# presumably so the same suggestion is produced on every run (confirm).\n",
        "suggest_initial_mutants(\n",
        "    '/content/output/kelsic.fasta',\n",
        "    num_mutants=12,\n",
        "    random_seed=42,\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "eXNaQq1IpBBV"
      },
      "source": [
        "## PLM: extract protein language model embeddings"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ASOkwC-nlGkN",
        "outputId": "5b4a904c-4142-41af-cfdd-3fe47b221e83"
      },
      "outputs": [],
      "source": [
        "# Run the repository's ESM extraction script with model esm1b_t33_650M_UR50S on kelsic.fasta.\n",
        "# NOTE(review): later cells read /content/output/kelsic_esm1b_t33_650M_UR50S.csv,\n",
        "# presumably the file written because of --concatenate_dir; confirm against extract.py.\n",
        "!python evolvepro/plm/esm/extract.py esm1b_t33_650M_UR50S /content/output/kelsic.fasta /content/output/kelsic_esm1b_t33_650M_UR50S --toks_per_batch 512 --include mean --concatenate_dir /content/output"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "5cYm9CwunCUr"
      },
      "source": [
        "## Run EVOLVEpro"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "UVmQ1CruxxQr"
      },
      "outputs": [],
      "source": [
        "from evolvepro.src.evolve import evolve_experimental\n",
        "\n",
        "# Shared settings passed to evolve_experimental() by every round cell below.\n",
        "protein_name = 'kelsic'\n",
        "number_of_variants = 12\n",
        "rename_WT = False\n",
        "\n",
        "# Input locations.\n",
        "embeddings_base_path = '/content/output'\n",
        "embeddings_file_name = 'kelsic_esm1b_t33_650M_UR50S.csv'\n",
        "round_base_path = '/content/EvolvePro/colab/rounds_data'\n",
        "wt_fasta_path = '/content/output/kelsic_WT.fasta'\n",
        "\n",
        "# Output location; the final plotting cell reads this variable as well.\n",
        "output_dir = '/content/output/'"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Mtt0jtAm1iw2"
      },
      "source": [
        "#### Round 1"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "XYcxk5M5ot4m",
        "outputId": "f2b9c98e-53bf-44de-9c7b-196746743b64"
      },
      "outputs": [],
      "source": [
        "# Round 1 is given the measurement file of round 1 only.\n",
        "round_name = 'Round1'\n",
        "round_file_names = [f'kelsic_Round{i}.xlsx' for i in range(1, 2)]\n",
        "\n",
        "# Arguments are positional, so their order must stay as written.\n",
        "this_round_variants, df_test, df_sorted_all = evolve_experimental(\n",
        "    protein_name, round_name,\n",
        "    embeddings_base_path, embeddings_file_name,\n",
        "    round_base_path, round_file_names,\n",
        "    wt_fasta_path, rename_WT, number_of_variants, output_dir,\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "hp1LcEwY1nrS"
      },
      "source": [
        "#### Round 2"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "twLZxKaOyj4X",
        "outputId": "1f938417-bf1f-4294-9b0b-900ca7b1dcb2"
      },
      "outputs": [],
      "source": [
        "# Round 2 is given the measurement files of rounds 1-2.\n",
        "round_name = 'Round2'\n",
        "round_file_names = [f'kelsic_Round{i}.xlsx' for i in range(1, 3)]\n",
        "\n",
        "# Arguments are positional, so their order must stay as written.\n",
        "this_round_variants, df_test, df_sorted_all = evolve_experimental(\n",
        "    protein_name, round_name,\n",
        "    embeddings_base_path, embeddings_file_name,\n",
        "    round_base_path, round_file_names,\n",
        "    wt_fasta_path, rename_WT, number_of_variants, output_dir,\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "B4EPm84C1pS8"
      },
      "source": [
        "#### Round 3"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "kz5fPRMJy7Dk",
        "outputId": "6d32c010-09a5-47f7-ba76-e133e5f86817"
      },
      "outputs": [],
      "source": [
        "# Round 3 is given the measurement files of rounds 1-3.\n",
        "round_name = 'Round3'\n",
        "round_file_names = [f'kelsic_Round{i}.xlsx' for i in range(1, 4)]\n",
        "\n",
        "# Arguments are positional, so their order must stay as written.\n",
        "this_round_variants, df_test, df_sorted_all = evolve_experimental(\n",
        "    protein_name, round_name,\n",
        "    embeddings_base_path, embeddings_file_name,\n",
        "    round_base_path, round_file_names,\n",
        "    wt_fasta_path, rename_WT, number_of_variants, output_dir,\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "uzKvCWGX1qxB"
      },
      "source": [
        "#### Round 4"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "w8zWK7IlzOwB",
        "outputId": "42414526-e9a1-438e-b6a1-2adae7a8b627"
      },
      "outputs": [],
      "source": [
        "# Round 4 is given the measurement files of rounds 1-4.\n",
        "round_name = 'Round4'\n",
        "round_file_names = [f'kelsic_Round{i}.xlsx' for i in range(1, 5)]\n",
        "\n",
        "# Arguments are positional, so their order must stay as written.\n",
        "this_round_variants, df_test, df_sorted_all = evolve_experimental(\n",
        "    protein_name, round_name,\n",
        "    embeddings_base_path, embeddings_file_name,\n",
        "    round_base_path, round_file_names,\n",
        "    wt_fasta_path, rename_WT, number_of_variants, output_dir,\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "qDPcYiLY1tZl"
      },
      "source": [
        "#### Round 5"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "nruFZ5a4ziFl",
        "outputId": "ce3497fa-66af-4fdc-9636-1b68f9c8af9a"
      },
      "outputs": [],
      "source": [
        "# Round 5 is given the measurement files of rounds 1-5.\n",
        "round_name = 'Round5'\n",
        "round_file_names = [f'kelsic_Round{i}.xlsx' for i in range(1, 6)]\n",
        "\n",
        "# Arguments are positional, so their order must stay as written.\n",
        "this_round_variants, df_test, df_sorted_all = evolve_experimental(\n",
        "    protein_name, round_name,\n",
        "    embeddings_base_path, embeddings_file_name,\n",
        "    round_base_path, round_file_names,\n",
        "    wt_fasta_path, rename_WT, number_of_variants, output_dir,\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "g0YzCWOL1xdF"
      },
      "source": [
        "## Plot"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "46D0aZra2ffS"
      },
      "outputs": [],
      "source": [
        "from evolvepro.src.plot import read_exp_data, plot_variants_by_iteration\n",
        "\n",
        "# Plot inputs: the measurement files of rounds 1-5 and the wild-type FASTA.\n",
        "round_base_path = '/content/EvolvePro/colab/rounds_data'\n",
        "round_file_names = [f'kelsic_Round{i}.xlsx' for i in range(1, 6)]\n",
        "wt_fasta_path = '/content/output/kelsic_WT.fasta'"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 607
        },
        "id": "KyYpI7pr2a1i",
        "outputId": "e66ff38a-0460-49f3-fb19-fa372c14b465"
      },
      "outputs": [],
      "source": [
        "# output_dir is not set in the plot cells; it comes from the EVOLVEpro configuration cell.\n",
        "df = read_exp_data(round_base_path, round_file_names, wt_fasta_path)\n",
        "plot_variants_by_iteration(\n",
        "    df,\n",
        "    activity_column='activity',\n",
        "    output_dir=output_dir,\n",
        "    output_file='kelsic',\n",
        ")\n"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "gpuType": "T4",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
